In [1]:
#https://github.com/Avik-Jain/100-Days-Of-ML-Code/blob/master/Code/Day2_Simple_Linear_Regression.md
In [13]:
# Step 1: Data Preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
dataset = pd.read_csv('studentscores.csv')
X = dataset.iloc[:, :1].values  # feature column(s); the slice keeps X 2-D
Y = dataset.iloc[:, 1].values   # target column as a 1-D array
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/4, random_state=0)
In [14]:
print(X_train, '\n\n', Y_train)
In [15]:
print(X_test, '\n\n', Y_test)
In [16]:
plt.scatter(X_train, Y_train, color='red')
Out[16]:
In [17]:
# Step 2: Fitting Simple Linear Regression Model to the training set
from sklearn.linear_model import LinearRegression # Ordinary least squares Linear Regression
regressor = LinearRegression()
regressor.fit(X_train, Y_train)  # fit the linear model; fit() returns the estimator, so reassignment is unnecessary
In [19]:
regressor.score(X_train, Y_train) # Returns the coefficient of determination R^2 of the prediction.
Out[19]:
The best possible score is 1.0, and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get an R^2 score of 0.0.
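As a sketch of what score() computes, R^2 can be reproduced from its definition, R^2 = 1 - SS_res / SS_tot (the variable names below are illustrative):

y_hat = regressor.predict(X_train)
ss_res = np.sum((Y_train - y_hat) ** 2)           # residual sum of squares
ss_tot = np.sum((Y_train - Y_train.mean()) ** 2)  # total sum of squares
print(1 - ss_res / ss_tot)  # should match regressor.score(X_train, Y_train)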
In [21]:
# Estimated coefficients for the linear regression problem.
# If multiple targets are passed during the fit (y 2D), this is a 2D array of shape (n_targets, n_features),
# while if only one target is passed, this is a 1D array of length n_features.
regressor.coef_
Out[21]:
In [24]:
regressor.intercept_ # Independent term in the linear model.
Out[24]:
In [22]:
# Step 3: Predicting the Result
Y_pred = regressor.predict(X_test)
In [25]:
print(Y_pred)
In [26]:
# Step 4: Visualization
# Visualizing the training results
plt.scatter(X_train, Y_train, color='red')
plt.plot(X_train, regressor.predict(X_train), color='blue')
Out[26]:
In [27]:
# Visualizing the test results
plt.scatter(X_test, Y_test, color='red')
plt.plot(X_test, regressor.predict(X_test), color='blue')
Out[27]:
In [42]:
X_test
Out[42]:
In [43]:
Y_test
Out[43]:
In [12]:
regressor.score(X_train, Y_train)
Out[12]:
In [28]:
regressor.predict(X_train)
Out[28]:
In [40]:
regressor.predict(np.array([[2]])) # the value inside the double brackets is the x value on the fitted line; the return is the corresponding y
Out[40]:
In [41]:
regressor.predict(np.array([[5]]))
Out[41]:
In [45]:
regressor.predict(np.array([[1.5]])) # a value on the fitted line, not one of the training or test points
Out[45]:
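These single-point predictions are just the fitted line y = intercept + slope * x evaluated at the given x; a quick sanity check (a minimal sketch reusing the fitted regressor):

x = 1.5
print(regressor.intercept_ + regressor.coef_[0] * x)  # same value as regressor.predict(np.array([[1.5]]))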
In [46]:
# https://www.kdnuggets.com/2019/03/beginners-guide-linear-regression-python-scikit-learn.html
In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as seabornInstance
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline
In [48]:
dataset = pd.read_csv('Weather.csv')
In [49]:
dataset.head()
Out[49]:
In [50]:
dataset.shape
Out[50]:
In [51]:
dataset.info()
In [52]:
dataset.describe()
Out[52]:
Finally, let's plot our data points on a 2-D graph to eyeball the dataset and see whether we can spot any relationship between the two variables, using the script below:
In [53]:
dataset.plot(x='MinTemp', y='MaxTemp', style='o')
plt.title('MinTemp vs MaxTemp')
plt.xlabel('MinTemp')
plt.ylabel('MaxTemp')
plt.show()
Let's check the distribution of the maximum temperature; once we plot it, we can observe that the average maximum temperature falls between roughly 25 and 35.
In [54]:
plt.figure(figsize=(15,10))
plt.tight_layout()
seabornInstance.histplot(dataset['MaxTemp'], kde=True)  # distplot() is deprecated in newer seaborn; histplot(..., kde=True) is its replacement
Out[54]:
Our next step is to divide the data into "attributes" and "labels".
Attributes are the independent variables, while labels are the dependent variables whose values are to be predicted. Our dataset has only two columns, and we want to predict MaxTemp from the recorded MinTemp. Therefore the attribute set consists of the "MinTemp" column, stored in the X variable, and the label is the "MaxTemp" column, stored in the y variable.
In [55]:
X = dataset['MinTemp'].values.reshape(-1,1)
y = dataset['MaxTemp'].values.reshape(-1,1)
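The reshape(-1, 1) matters because scikit-learn expects the feature matrix X to be 2-D with shape (n_samples, n_features); a quick shape check (a minimal sketch):

print(dataset['MinTemp'].values.shape)  # (n_samples,)   - 1-D, which fit() would reject as X
print(X.shape)                          # (n_samples, 1) - 2-D column vector, as required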
Next, we assign 80% of the data to the training set and 20% to the test set using the code below.
The test_size parameter is where we specify the proportion of data held out for the test set.
In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
After splitting the data into training and test sets, it is finally time to train our algorithm. For that, we import the LinearRegression class, instantiate it, and call its fit() method with our training data.
In [57]:
regressor = LinearRegression()
regressor.fit(X_train, y_train) #training the algorithm
Out[57]:
As discussed, the linear regression model finds the values of the intercept and slope that yield the line best fitting the data. To see the intercept and slope calculated by the algorithm for our dataset, execute the following code.
In [58]:
#To retrieve the intercept:
print(regressor.intercept_)
#For retrieving the slope:
print(regressor.coef_)
The result should be approximately 10.66185201 and 0.92033997, respectively.
This means that for every one-unit change in the minimum temperature, the maximum temperature changes by about 0.92 units.
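To make that interpretation concrete, the fitted line can be evaluated by hand (a sketch; because y was reshaped to 2-D, intercept_ and coef_ are arrays here, and 20.0 is an arbitrary example input):

min_temp = 20.0
max_temp_hat = regressor.intercept_[0] + regressor.coef_[0][0] * min_temp
print(max_temp_hat)  # roughly 10.66 + 0.92 * 20, i.e. about 29.07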
Now that we have trained our algorithm, it's time to make some predictions. We will use our test data and see how accurately the algorithm predicts the maximum temperature. To make predictions on the test data, execute the following script:
In [59]:
y_pred = regressor.predict(X_test)
To compare the actual output values for X_test with the predicted values, execute the following script:
In [60]:
df = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})
df
Out[60]:
We can also visualize the comparison as a bar graph using the script below.
Note: as the number of records is huge, only the first 25 are shown for readability.
In [61]:
df1 = df.head(25)
df1.plot(kind='bar', figsize=(16,10))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()
Though our model is not very precise, the predicted values are close to the actual ones.
Let's plot our fitted straight line against the test data:
In [62]:
plt.scatter(X_test, y_test, color='gray')
plt.plot(X_test, y_pred, color='red', linewidth=2)
plt.show()
In [63]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
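These three metrics follow directly from their definitions, so they can be cross-checked with plain NumPy (a minimal sketch; y_test and y_pred are both (n_samples, 1) arrays here):

errors = y_test - y_pred
print(np.mean(np.abs(errors)))        # MAE: mean absolute error
print(np.mean(errors ** 2))           # MSE: mean squared error
print(np.sqrt(np.mean(errors ** 2)))  # RMSE: square root of MSE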
In [64]:
# https://www.kdnuggets.com/2019/03/beginners-guide-linear-regression-python-scikit-learn.html/2